#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
将文本整合到 train、test、val 三个文件中
"""

import os
import pandas as pd
# 获取类别
def getCategories(path=r'D:\anyou\minshi\minshi_labels_9000_whole-1.xls'):
    """Return the category label for every row of the label workbook.

    The first column of sheet ``Sheet1`` is read without a header row.  Each
    cell is converted to ``str`` and the label is the part that precedes the
    first ``.`` or ``/`` in that text.

    Args:
        path: Location of the Excel workbook.  Defaults to the path that was
            previously hard-coded in this function.

    Returns:
        A list of label strings, one per workbook row, in row order.
        Duplicates are kept.
    """
    # ``sheet_name`` is the keyword pandas accepts; the older ``sheetname``
    # spelling is rejected by current pandas releases.
    df = pd.read_excel(path, sheet_name='Sheet1', header=None)
    first_column = df.iloc[:, 0]
    # Treat '.' and '/' as the same separator, then keep the leading part.
    return [str(name).replace(".", "/").split("/")[0] for name in first_column]
def _read_file(filename):
    """Return the full text of *filename* collapsed onto a single line.

    The file is decoded as UTF-8, and every newline, tab and ideographic
    space (U+3000) is removed from its content.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        text = handle.read()
    # Strip each unwanted character in turn.
    for unwanted in ('\n', '\t', '\u3000'):
        text = text.replace(unwanted, '')
    return text

def save_file(dirname, out_dir=r'D:\anyou\minshi\cnn\0322', categories=None,
              train_size=1000, test_size=500, val_size=500):
    """Merge the per-category text files into train/test/val files.

    For every category, ``<dirname>/<category>.txt`` is read line by line.
    The first ``train_size`` lines go to ``train.txt``, the next
    ``test_size`` lines to ``test.txt`` and the next ``val_size`` lines to
    ``val.txt``; any remaining lines are ignored.  Each output record is one
    line of the form ``category<TAB>content``.

    Args:
        dirname: Directory that holds one ``<category>.txt`` file per label.
        out_dir: Directory in which ``train.txt``, ``test.txt`` and
            ``val.txt`` are created (existing files are overwritten).
            Defaults to the directory that was previously hard-coded.
        categories: Iterable of category names.  When ``None`` the names are
            obtained from ``getCategories()``.
        train_size: Number of leading lines per category written to train.
        test_size: Number of following lines per category written to test.
        val_size: Number of following lines per category written to val.

    Raises:
        OSError: If an output file cannot be created or a category file
            cannot be opened.
    """
    if categories is None:
        categories = getCategories()

    # Exclusive upper bounds of the test and validation slices.
    test_end = train_size + test_size
    val_end = test_end + val_size

    train_path = os.path.join(out_dir, 'train.txt')
    test_path = os.path.join(out_dir, 'test.txt')
    val_path = os.path.join(out_dir, 'val.txt')

    # ``with`` guarantees the outputs are flushed and closed even when a
    # category file is missing or unreadable.
    with open(train_path, 'w', encoding='utf-8') as f_train, \
            open(test_path, 'w', encoding='utf-8') as f_test, \
            open(val_path, 'w', encoding='utf-8') as f_val:
        for category in categories:
            cat_path = os.path.join(dirname, category + '.txt')
            with open(cat_path, encoding='UTF-8') as f:
                for count, content in enumerate(f):
                    if count >= val_end:
                        # Nothing beyond the validation slice is used.
                        break
                    # Lines read from the file keep their trailing newline;
                    # drop it so each record occupies exactly one line.
                    record = category + '\t' + content.rstrip('\n') + '\n'
                    if count < train_size:
                        f_train.write(record)
                    elif count < test_end:
                        f_test.write(record)
                    else:
                        f_val.write(record)

            print('Finished:', category)


if __name__ == '__main__':
    # Build the three split files from the per-category source directory.
    save_file(r'D:\anyou\minshi\src/')
    # Print the line count of each split (train, test, val, in that order).
    # ``with`` closes every file as soon as it has been counted.
    for split_path in (r'D:\anyou\minshi\cnn\0322/train.txt',
                       r'D:\anyou\minshi\cnn\0322/test.txt',
                       r'D:\anyou\minshi\cnn\0322/val.txt'):
        with open(split_path, 'r', encoding='utf-8') as split_file:
            print(len(split_file.readlines()))
